This report is meant to help explore the results of the derfinder2 package. While the report is rich, it is meant to just start the exploration of the results and exemplify some of the code used to do so. You will most likely need a more in-depth analysis for your specific data set.
suppressMessages(library("derfinder2"))
suppressMessages(library("IRanges"))
library("ggplot2")
suppressMessages(library("gridExtra"))
library("parallel")
suppressMessages(library("GenomicRanges"))
suppressMessages(library("ggbio"))
suppressMessages(library("TxDb.Hsapiens.UCSC.hg19.knownGene"))
## Set the prefix and number of cores if not present
if (!"prefix" %in% ls()) {
prefix <- "run1"
}
if (!"cores" %in% ls()) {
## Make sure you asked for enough cores in the cluster queue!
cores <- 4
}
if (!"makeBestPlots" %in% ls()) {
makeBestPlots <- TRUE
}
Using 1 cores on prefix run1 from path /amber2/scratch/lcollado/derfinderExample/derAnalysis.
## Load files
load(paste0(prefix, "/fullRegsAnno.Rdata"))
load(paste0(prefix, "/fullNullstats.Rdata"))
load(paste0(prefix, "/fullNullwidths.Rdata"))
load(paste0(prefix, "/fullTime.Rdata"))
load(paste0(prefix, "/fullRegionsDF.Rdata"))
load(paste0(prefix, "/chr1/optionsStats-1.Rdata"))
## Get chr lengths
data(hg19Ideogram, package = "biovizBase")
seqlengths(fullRegionsDF) <- seqlengths(hg19Ideogram)[names(seqlengths(fullRegionsDF))]
## Find which chrs are present in the data set
chrs <- levels(seqnames(fullRegionsDF))
names(chrs) <- gsub("chr", "", chrs)
Processing code.
### p-value functions Recursively figure out the pvalues
getPval <- function(df, vals, res, i, stop) {
## Figure out where to work
if (df$null[1] > vals[i]) {
j <- 1
} else {
j <- which(head(df$null, 20) > vals[i])[1]
if (is.na(j)) {
j <- which(head(df$null, 100) > vals[i])[1]
if (is.na(j)) {
j <- which(df$null > vals[i])[1]
if (is.na(j)) {
return(res)
}
}
}
}
res[i] <- df$n[j]
if (i == stop) {
return(res)
} else {
newdf <- df[j:nrow(df), ]
getPval(newdf, vals, res, i + 1, stop)
}
}
## Apply the recursive function by chunks
getPval.apply <- function(k, cuts, df, vals, df.n) {
start <- which(df$null >= cuts[k])[1]
newvals <- as.numeric(vals[vals >= cuts[k] & vals < cuts[k + 1]])
tmpres <- rep(0, length(newvals))
if (is.na(start)) {
return(tmpres)
} else {
end <- which(df$null <= cuts[k + 1])[1]
if (is.na(end)) {
end <- df.n
} else {
end <- min(1 + end, df.n)
}
newdf <- df[start:end, ]
res <- getPval(newdf, newvals, tmpres, 1, length(tmpres))
return(res)
}
}
## Should it re-process the files or load previous ones?
procFiles <- sapply(c(paste0(prefix, "/fullRegsAnnoPooled.Rdata"), paste0(prefix,
"/fullNullSummary.Rdata")), file.exists)
procFiles <- all(procFiles)
if (procFiles) {
## Just load the pre-saved files
load(paste0(prefix, "/fullRegsAnnoPooled.Rdata"))
load(paste0(prefix, "/fullNullSummary.Rdata"))
} else {
## Process the nullstats
nulls <- unlist(lapply(fullNullstats, as.numeric))
nullstats <- Rle(sort(nulls))
fullRegsAnno.ord <- fullRegsAnno[order(fullRegsAnno$value), ]
vals <- Rle(fullRegsAnno.ord$value)
## Construct null info
null.df <- data.frame(null = runValue(nullstats), n = length(nullstats) -
cumsum(runLength(nullstats)))
## Calculate pvalues using all null stats from all the chrs
nCuts <- ceiling(length(vals)/1000)
cuts <- quantile(vals, (0:nCuts)/nCuts)
cuts[length(cuts)] <- cuts[length(cuts)] * 1.1
df.n <- nrow(null.df)
pvals <- mclapply(seq_len(length(cuts) - 1), getPval.apply, cuts = cuts,
df = null.df, vals = vals, df.n = df.n, mc.cores = cores)
pvals <- do.call(c, pvals)
fullRegsAnno.ord$pvaluesPool <- (pvals + 1)/(length(nullstats) + 1)
fullRegsAnno$pvaluesPool <- fullRegsAnno.ord$pvaluesPool[order(fullRegsAnno$value)]
rm(nullstats, fullRegsAnno.ord, vals, null.df, nCuts, cuts, df.n, pvals)
## For Fstat vs width
widths <- unlist(lapply(fullNullwidths, as.numeric))
howMany <- unlist(lapply(fullNullstats, length))
fullNullSummary <- data.frame(nullstat = nulls, nullwidth = widths, chr = rep(paste0("chr",
names(fullNullstats)), howMany))
rm(nulls, widths, howMany)
fullNullSummary$area <- fullNullSummary$nullstat * fullNullSummary$nullwidth
## Area penalization scheme 1
fullRegsAnno$penArea1 <- fullRegsAnno$value * fullRegsAnno$L
tmp <- fullRegsAnno$L > 30
fullRegsAnno$penArea1[tmp] <- fullRegsAnno$value[tmp] * 30 + fullRegsAnno$value[tmp] *
sqrt(fullRegsAnno$L[tmp] - 30)
fullNullSummary$penArea1 <- fullNullSummary$nullstat * fullNullSummary$nullwidth
tmp <- fullNullSummary$nullwidth > 30
fullNullSummary$penArea1[tmp] <- fullNullSummary$nullstat[tmp] * 30 + fullNullSummary$nullstat[tmp] *
sqrt(fullNullSummary$nullwidth[tmp] - 30)
## Area penalization scheme 2
fullRegsAnno$penArea2 <- fullRegsAnno$value * fullRegsAnno$L
tmp <- fullRegsAnno$L > 30
fullRegsAnno$penArea2[tmp] <- fullRegsAnno$value[tmp] * 30 + fullRegsAnno$value[tmp] *
log(fullRegsAnno$L[tmp] - 30)
fullNullSummary$penArea2 <- fullNullSummary$nullstat * fullNullSummary$nullwidth
tmp <- fullNullSummary$nullwidth > 30
fullNullSummary$penArea2[tmp] <- fullNullSummary$nullstat[tmp] * 30 + fullNullSummary$nullstat[tmp] *
log(fullNullSummary$nullwidth[tmp] - 30)
rm(tmp)
}
## For genome overview plots -- needed before version 0.0.12
## fullRegionsDF$significant <- factor(fullRegionsDF$pvalues < 0.05,
## levels=c(TRUE, FALSE)) fullRegionsDF$significantQval <-
## factor(fullRegionsDF$qvalues < 0.10, levels=c(TRUE, FALSE))
fullRegionsDF$significantPool <- factor(fullRegsAnno$pvaluesPool < 0.05, levels = c(TRUE,
FALSE))
## Visual comparison
p1 <- ggplot(fullRegsAnno, aes(x = pvalues, colour = chr)) + geom_line(stat = "density") +
xlim(0, 1) + labs(title = "Density of p-values - by chr") + xlab("p-values") +
scale_colour_discrete(limits = chrs)
p2 <- ggplot(fullRegsAnno, aes(x = pvaluesPool, colour = chr)) + geom_line(stat = "density") +
xlim(0, 1) + labs(title = "Density of p-values - pooled") + xlab("p-values") +
scale_colour_discrete(limits = chrs)
grid.arrange(p1, p2)
This plot is useful for comparing the densities of the permutted p-values for the regions. The top panel shows the p-values computed by chromosome, meaning that the null statistics used were chromosome specific. The second panel shows the p-values computed by pooling the null statistics from all the chromosomes.
There are a total of 42445 regions and 8993 null statistics across all chromosomes.
## Compare the pvalues
summary(fullRegsAnno$pvalues)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0011 0.0564 0.2810 0.3530 0.6060 1.0000
summary(fullRegsAnno$pvaluesPool)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001 0.0001 0.0001 0.0019 0.0001 1.0000
The previous output shows the summaries for the p-values distribution. First, by chr and next using by pooling.
summary(fullRegsAnno$qvalues)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0047 0.1270 0.3180 0.3120 0.4640 1.0000
This is the numerical summary of the distribution of the q-values.
## Compare them directly
hist(fullRegsAnno$pvalues - fullRegsAnno$pvaluesPool, freq = FALSE, col = "light blue",
main = "Difference in p-values", xlab = "p-value - pooled p-value")
lines(density(fullRegsAnno$pvalues - fullRegsAnno$pvaluesPool), col = "red")
This plot shows the paired differences between the by chr p-values and the pooled p-values.
summary(fullRegsAnno$pvalues - fullRegsAnno$pvaluesPool)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.9930 0.0553 0.2800 0.3510 0.6060 1.0000
This is a numerical summary of the differences.
p3 <- ggplot(fullRegsAnno, aes(x = log10(L), y = value, colour = chr, alpha = area)) +
geom_point() + labs(title = "F-stat vs width (regions)") + xlab("Region width (log10)") +
ylab("F-stat") + scale_colour_discrete(limits = chrs)
p4 <- ggplot(fullNullSummary, aes(x = log10(nullwidth), y = nullstat, colour = chr,
alpha = area)) + geom_point() + labs(title = "F-stat vs width (null regions)") +
xlab("Region width (log10)") + ylab("F-stat") + scale_colour_discrete(limits = chrs)
grid.arrange(p3, p4)
This plot shows the relationship between the F-statistics and the width (length) for the regions. The bottom panel is the same information but this time for the null regions.
p5a <- ggplot(fullRegsAnno, aes(x = log10(L), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of region lengths") + xlab("Region width (log10)") +
scale_colour_discrete(limits = chrs)
p5b <- ggplot(subset(fullRegsAnno, pvalues < 0.05), aes(x = log10(L), colour = chr)) +
geom_line(stat = "density") + labs(title = "Density of region lengths (significant only)") +
xlab("Region width (log10)") + scale_colour_discrete(limits = chrs)
grid.arrange(p5a, p5b)
This plot shows the density of the region lengths for the regions (all) and regions (significant only).
p7a <- ggplot(fullRegsAnno, aes(x = log10(area), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of region areas") + xlab("Region area (log10)") +
scale_colour_discrete(limits = chrs)
p7b <- ggplot(subset(fullRegsAnno, pvalues < 0.05), aes(x = log10(area), colour = chr)) +
geom_line(stat = "density") + labs(title = "Density of region areas (significant only)") +
xlab("Region area (log10)") + scale_colour_discrete(limits = chrs)
grid.arrange(p7a, p7b)
This plot shows the density of the region areas for the regions (all) and regions (significant only).
p6 <- ggplot(fullNullSummary, aes(x = log10(nullwidth), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of null region lengths") + xlab("Region width (log10)") +
scale_colour_discrete(limits = chrs)
p8 <- ggplot(fullNullSummary, aes(x = log10(area), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of null region areas") + xlab("Region area (log10)") +
scale_colour_discrete(limits = chrs)
grid.arrange(p6, p8)
This plot shows the density of the null region lengths and areas.
## Q-values dist
q1 <- ggplot(fullRegsAnno, aes(x = qvalues, colour = chr)) + geom_line(stat = "density") +
xlim(0, 1) + labs(title = "Density of q-values") + xlab("q-values") + scale_colour_discrete(limits = chrs)
q1
This plot shows the distribution of the q-values.
q2 <- ggplot(fullRegsAnno, aes(x = log10(L), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of region lengths") + xlab("Region width (log10)") +
scale_colour_discrete(limits = chrs)
sub <- subset(fullRegsAnno, qvalues < 0.1)
if (nrow(sub) > 0) {
q3 <- ggplot(sub, aes(x = log10(L), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of region lengths (qvalue significant only)") +
xlab("Region width (log10)") + scale_colour_discrete(limits = chrs)
grid.arrange(q2, q3)
} else {
q2
}
This plot shows the density of the region lengths for the regions (all) and regions (qvalue significant only, if any are present!). In this case significance is q-value < 0.10.
q4 <- ggplot(fullRegsAnno, aes(x = log10(area), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of region areas") + xlab("Region area (log10)") +
scale_colour_discrete(limits = chrs)
if (nrow(sub) > 0) {
q5 <- ggplot(subset(fullRegsAnno, qvalues < 0.1), aes(x = log10(area), colour = chr)) +
geom_line(stat = "density") + labs(title = "Density of region areas (qvalue significant only)") +
xlab("Region area (log10)") + scale_colour_discrete(limits = chrs)
grid.arrange(q4, q5)
} else {
q4
}
rm(sub)
This plot shows the density of the region areas for the regions (all) and regions (qvalue significant only, if any are present!). In this case significance is q-value < 0.10.
Below we present two alternative approaches for ranking regions based on the area and penalized by the width of the region.
p9 <- ggplot(fullRegsAnno, aes(x = log10(penArea1), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of penalized area (scheme 1)") + xlab("Penalized area with sqrt and width > 30 (log10)") +
scale_colour_discrete(limits = chrs)
p10 <- ggplot(fullNullSummary, aes(x = log10(penArea1), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of null penalized area (scheme 1)") + xlab("Penalized area with sqrt and width > 30 (log10)") +
scale_colour_discrete(limits = chrs)
grid.arrange(p9, p10)
This plot shows the densities for the penalized areas using:
\[ \begin{cases} Fstat * width \text{ if } width \leq 30 \\ Fstat * 30 + Fstat * \sqrt{ width - 30} & \text{ if } width > 30 \end{cases} \]
Below are the numerical summaries of the penalized areas (scheme 1) for the regions and the null regions in log10 scale.
summary(log10(fullRegsAnno$penArea1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.32 1.41 1.96 2.04 2.49 3.95
summary(log10(fullNullSummary$penArea1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.32 1.34 1.65 1.75 2.05 3.13
p11 <- ggplot(fullRegsAnno, aes(x = log10(penArea2), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of penalized area (scheme 2)") + xlab("Penalized area with log and width > 30 (log10)") +
scale_colour_discrete(limits = chrs)
p12 <- ggplot(fullNullSummary, aes(x = log10(penArea2), colour = chr)) + geom_line(stat = "density") +
labs(title = "Density of null penalized area (scheme 2)") + xlab("Penalized area with log and width > 30 (log10)") +
scale_colour_discrete(limits = chrs)
grid.arrange(p11, p12)
This plot shows the densities for the penalized areas using:
\[ \begin{cases} Fstat * width \text{ if } width \leq 30 \\ Fstat * 30 + Fstat * \log \left( width - 30 \right) & \text{ if } width > 30 \end{cases} \]
Below is are the numerical summaries of the penalized areas (scheme 2) for the regions and the null regions in log10 scale.
summary(log10(fullRegsAnno$penArea2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.32 1.41 1.96 2.04 2.49 3.93
summary(log10(fullNullSummary$penArea2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.32 1.34 1.65 1.75 2.05 3.09
plotOverview(regions = fullRegionsDF, type = "pval", base_size = 30, areaRel = 10,
legend.position = c(0.97, 0.12))
This plot shows the genomic locations of the regions found in the analysis. The significant regions (p-value less than 0.05) are highlighted and the area of the regions is shown on top of each chromosome. Note that the area is in a relative scale.
plotOverview(regions = fullRegionsDF, type = "qval", base_size = 30, areaRel = 10,
legend.position = c(0.97, 0.12))
This plot is similar to the previous one except that significance is now determined by a q-value less than 0.10.
## Graphical setup
ann_text <- data.frame(x = 2.25e+08, y = 10, lab = "Area", seqnames = "chrX")
ann_line <- data.frame(x = 2e+08, xend = 2.15e+08, y = 10, seqnames = "chrX")
## Make the plot ^^
autoplot(seqinfo(fullRegionsDF)) + layout_karyogram(fullRegionsDF, aes(fill = significantPool,
color = significantPool), geom = "rect") + layout_karyogram(fullRegionsDF,
aes(x = midpoint, y = area), geom = "line", color = "coral1", ylim = c(10,
20)) + labs(title = "Overview of regions found in the genome; significant: p-value <0.05 (pooled)") +
scale_colour_manual(values = c("chartreuse4", "wheat2")) + scale_fill_manual(values = c("chartreuse4",
"wheat2")) + geom_text(aes(x = x, y = y), data = ann_text, label = "Area",
size = rel(10)) + geom_segment(aes(x = x, xend = xend, y = y, yend = y),
data = ann_line, colour = "coral1") + xlab("Genomic coordinate") + theme(text = element_text(size = 30),
legend.background = element_blank(), legend.position = c(0.97, 0.12))
This is a very similar plot that uses the pooled p-values to determine significance (less than 0.05).
plotOverview(regions = fullRegionsDF, annotation = fullRegsAnno, type = "annotation",
base_size = 30, areaRel = 10, legend.position = c(0.97, 0.12))
This final genomic overview plot shows the annotation region type. Note that the regions are shown only if the information is available. Below is a table of the actual number of results per annotation region type.
table(fullRegsAnno$region, useNA = "always")
##
## upstream promoter overlaps 5' inside overlaps 3' close to 3'
## 2816 244 11 29501 13 104
## downstream covers <NA>
## 2948 0 6808
## Load coverage data
load("../fullCoverage/fullCov.Rdata")
## Graphical setup: ideograms
p.ideos <- lapply(chrs, function(xx) {
plotIdeogram(genome = "hg19", subchr = xx)
})
names(p.ideos) <- chrs
## Graphical setup: main plotting function
regionPlot <- function(idx, tUse = "pval") {
## Chr specific selections
chr <- as.character(seqnames(fullRegionsDF[idx]))
chrnum <- gsub("chr", "", chr)
p.ideo <- p.ideos[[chr]]
covInfo <- fullCov[[chrnum]]
## Make the plot
p <- plotRegion(idx, regions = fullRegionsDF, annotation = fullRegsAnno,
coverageInfo = covInfo, groupInfo = optionsStats$group, titleUse = tUse,
txdb = TxDb.Hsapiens.UCSC.hg19.knownGene, p.ideogram = p.ideo)
print(p)
rm(p.ideo, covInfo)
return(invisible(TRUE))
}
Below are the best 20 regions ordered by p-value. There are 10173 regions with p-value less than 0.05 and 7822 that also have a length greater than 10 base-pairs.
bestPval <- head(order(fullRegsAnno$pvalues), 20)
## Genome plots
for (idx in bestPval) {
regionPlot(idx)
}
## Output detail
fullRegsAnno[bestPval, ]
## chr start end value area cluster indexStart indexEnd L
## 3 chr1 14524 14568 44.01 1980.6 1 571 615 45
## 7 chr1 14652 14682 39.60 1227.5 1 699 729 31
## 8 chr1 14985 15020 38.47 1384.8 1 892 927 36
## 11 chr1 15834 15905 72.23 5200.7 1 984 1055 72
## 14 chr1 16921 17038 35.63 4203.9 1 1320 1437 118
## 24 chr1 19036 19066 44.15 1368.7 1 2165 2195 31
## 25 chr1 19738 19771 35.28 1199.4 1 2204 2237 34
## 26 chr1 24747 24786 42.61 1704.5 1 6265 6304 40
## 30 chr1 135068 135111 44.28 1948.4 1 10364 10407 44
## 35 chr1 135737 135759 33.28 765.3 1 11033 11055 23
## 36 chr1 135761 135777 33.88 576.0 1 11057 11073 17
## 47 chr1 137660 137679 33.60 671.9 1 12311 12330 20
## 54 chr1 138056 138083 38.50 1078.1 1 12683 12710 28
## 68 chr1 139387 139425 36.93 1440.3 1 13917 13955 39
## 81 chr1 324706 324747 33.67 1414.3 1 21897 21938 42
## 86 chr1 325740 325784 32.62 1467.7 1 22863 22907 45
## 97 chr1 327577 327592 34.01 544.2 1 24107 24122 16
## 98 chr1 327595 327613 34.21 650.0 1 24125 24143 19
## 106 chr1 328243 328284 46.96 1972.2 1 24771 24812 42
## 125 chr1 565244 565288 33.28 1497.5 1 29245 29289 45
## clusterL pvalues qvalues significant significantQval name
## 3 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 7 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 8 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 11 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 14 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 24 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 25 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 26 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 30 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 35 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 36 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 47 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 54 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 68 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 81 208851 0.001081 0.004675 TRUE TRUE LOC100133331
## 86 208851 0.001081 0.004675 TRUE TRUE LOC100133331
## 97 208851 0.001081 0.004675 TRUE TRUE LOC100132287
## 98 208851 0.001081 0.004675 TRUE TRUE LOC100132287
## 106 208851 0.001081 0.004675 TRUE TRUE LOC100132287
## 125 208851 0.001081 0.004675 TRUE TRUE OR4F29
## annotation description region distance
## 3 NR_024540 <NA> <NA> 0
## 7 NR_024540 <NA> <NA> 0
## 8 NR_024540 <NA> <NA> 0
## 11 NR_024540 <NA> <NA> 0
## 14 NR_024540 <NA> <NA> 0
## 24 NR_024540 <NA> <NA> 0
## 25 NR_024540 <NA> <NA> 0
## 26 NR_024540 <NA> <NA> 0
## 30 NR_039983 <NA> <NA> 0
## 35 NR_039983 <NA> <NA> 0
## 36 NR_039983 <NA> <NA> 0
## 47 NR_039983 <NA> <NA> 0
## 54 NR_039983 <NA> <NA> 0
## 68 NR_039983 <NA> <NA> 0
## 81 NR_028327 overlaps exon upstream inside 418
## 86 NR_028327 inside exon inside 1452
## 97 NR_028322 inside exon inside 3685
## 98 NR_028322 inside exon inside 3703
## 106 NR_028322 inside exon inside 4351
## 125 <NA> downstream downstream 56746
## subregion insidedistance exonnumber nexons
## 3 <NA> NA NA 7
## 7 <NA> NA NA 7
## 8 <NA> NA NA 7
## 11 <NA> NA NA 5
## 14 <NA> NA NA 7
## 24 <NA> NA NA 7
## 25 <NA> NA NA 7
## 26 <NA> NA NA 7
## 30 <NA> NA NA 5
## 35 <NA> NA NA 5
## 36 <NA> NA NA 5
## 47 <NA> NA NA 5
## 54 <NA> NA NA 5
## 68 <NA> NA NA 5
## 81 overlaps exon upstream 0 3 4
## 86 inside exon 0 4 4
## 97 inside exon 0 3 3
## 98 inside exon 0 3 3
## 106 inside exon 0 3 3
## 125 <NA> NA NA 1
## UTR strand geneL codingL chrnum pvaluesPool
## 3 <NA> - 0 0 1 0.0001112
## 7 <NA> - 0 0 1 0.0001112
## 8 <NA> - 0 0 1 0.0001112
## 11 <NA> - 0 0 1 0.0001112
## 14 <NA> - 0 0 1 0.0001112
## 24 <NA> - 0 0 1 0.0001112
## 25 <NA> - 0 0 1 0.0001112
## 26 <NA> - 0 0 1 0.0001112
## 30 <NA> - 0 0 1 0.0001112
## 35 <NA> - 0 0 1 0.0001112
## 36 <NA> - 0 0 1 0.0001112
## 47 <NA> - 0 0 1 0.0001112
## 54 <NA> - 0 0 1 0.0001112
## 68 <NA> - 0 0 1 0.0001112
## 81 inside transcription region + 1608 1090 1 0.0001112
## 86 3'UTR + 1608 1090 1 0.0001112
## 97 3'UTR + 4689 1262 1 0.0001112
## 98 3'UTR + 4689 1262 1 0.0001112
## 106 3'UTR + 4689 1262 1 0.0001112
## 125 <NA> - 938 938 1 0.0001112
## penArea1 penArea2
## 3 1490.9 1439.6
## 7 1227.5 1187.9
## 8 1248.2 1222.9
## 11 2635.1 2436.9
## 14 1403.0 1228.3
## 24 1368.7 1324.5
## 25 1128.9 1107.2
## 26 1413.1 1376.5
## 30 1494.2 1445.3
## 35 765.3 765.3
## 36 576.0 576.0
## 47 671.9 671.9
## 54 1078.1 1078.1
## 68 1218.7 1189.1
## 81 1126.8 1093.9
## 86 1104.8 1066.8
## 97 544.2 544.2
## 98 650.0 650.0
## 106 1571.4 1525.4
## 125 1127.2 1088.4
Below are the best 20 regions ordered by q-value. There are 9185 regions with q-value less than 0.10 and 7193 that also have a length greater than 10 base-pairs.
bestQval <- head(order(fullRegsAnno$qvalues), 20)
## Genome plots
for (idx in bestQval) {
regionPlot(idx, tUse = "qval")
}
## Output detail
fullRegsAnno[bestQval, ]
## chr start end value area cluster indexStart indexEnd L
## 3 chr1 14524 14568 44.01 1980.6 1 571 615 45
## 7 chr1 14652 14682 39.60 1227.5 1 699 729 31
## 8 chr1 14985 15020 38.47 1384.8 1 892 927 36
## 11 chr1 15834 15905 72.23 5200.7 1 984 1055 72
## 14 chr1 16921 17038 35.63 4203.9 1 1320 1437 118
## 24 chr1 19036 19066 44.15 1368.7 1 2165 2195 31
## 25 chr1 19738 19771 35.28 1199.4 1 2204 2237 34
## 26 chr1 24747 24786 42.61 1704.5 1 6265 6304 40
## 30 chr1 135068 135111 44.28 1948.4 1 10364 10407 44
## 35 chr1 135737 135759 33.28 765.3 1 11033 11055 23
## 36 chr1 135761 135777 33.88 576.0 1 11057 11073 17
## 47 chr1 137660 137679 33.60 671.9 1 12311 12330 20
## 54 chr1 138056 138083 38.50 1078.1 1 12683 12710 28
## 68 chr1 139387 139425 36.93 1440.3 1 13917 13955 39
## 81 chr1 324706 324747 33.67 1414.3 1 21897 21938 42
## 86 chr1 325740 325784 32.62 1467.7 1 22863 22907 45
## 97 chr1 327577 327592 34.01 544.2 1 24107 24122 16
## 98 chr1 327595 327613 34.21 650.0 1 24125 24143 19
## 106 chr1 328243 328284 46.96 1972.2 1 24771 24812 42
## 125 chr1 565244 565288 33.28 1497.5 1 29245 29289 45
## clusterL pvalues qvalues significant significantQval name
## 3 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 7 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 8 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 11 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 14 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 24 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 25 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 26 208851 0.001081 0.004675 TRUE TRUE WASH7P
## 30 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 35 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 36 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 47 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 54 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 68 208851 0.001081 0.004675 TRUE TRUE LOC729737
## 81 208851 0.001081 0.004675 TRUE TRUE LOC100133331
## 86 208851 0.001081 0.004675 TRUE TRUE LOC100133331
## 97 208851 0.001081 0.004675 TRUE TRUE LOC100132287
## 98 208851 0.001081 0.004675 TRUE TRUE LOC100132287
## 106 208851 0.001081 0.004675 TRUE TRUE LOC100132287
## 125 208851 0.001081 0.004675 TRUE TRUE OR4F29
## annotation description region distance
## 3 NR_024540 <NA> <NA> 0
## 7 NR_024540 <NA> <NA> 0
## 8 NR_024540 <NA> <NA> 0
## 11 NR_024540 <NA> <NA> 0
## 14 NR_024540 <NA> <NA> 0
## 24 NR_024540 <NA> <NA> 0
## 25 NR_024540 <NA> <NA> 0
## 26 NR_024540 <NA> <NA> 0
## 30 NR_039983 <NA> <NA> 0
## 35 NR_039983 <NA> <NA> 0
## 36 NR_039983 <NA> <NA> 0
## 47 NR_039983 <NA> <NA> 0
## 54 NR_039983 <NA> <NA> 0
## 68 NR_039983 <NA> <NA> 0
## 81 NR_028327 overlaps exon upstream inside 418
## 86 NR_028327 inside exon inside 1452
## 97 NR_028322 inside exon inside 3685
## 98 NR_028322 inside exon inside 3703
## 106 NR_028322 inside exon inside 4351
## 125 <NA> downstream downstream 56746
## subregion insidedistance exonnumber nexons
## 3 <NA> NA NA 7
## 7 <NA> NA NA 7
## 8 <NA> NA NA 7
## 11 <NA> NA NA 5
## 14 <NA> NA NA 7
## 24 <NA> NA NA 7
## 25 <NA> NA NA 7
## 26 <NA> NA NA 7
## 30 <NA> NA NA 5
## 35 <NA> NA NA 5
## 36 <NA> NA NA 5
## 47 <NA> NA NA 5
## 54 <NA> NA NA 5
## 68 <NA> NA NA 5
## 81 overlaps exon upstream 0 3 4
## 86 inside exon 0 4 4
## 97 inside exon 0 3 3
## 98 inside exon 0 3 3
## 106 inside exon 0 3 3
## 125 <NA> NA NA 1
## UTR strand geneL codingL chrnum pvaluesPool
## 3 <NA> - 0 0 1 0.0001112
## 7 <NA> - 0 0 1 0.0001112
## 8 <NA> - 0 0 1 0.0001112
## 11 <NA> - 0 0 1 0.0001112
## 14 <NA> - 0 0 1 0.0001112
## 24 <NA> - 0 0 1 0.0001112
## 25 <NA> - 0 0 1 0.0001112
## 26 <NA> - 0 0 1 0.0001112
## 30 <NA> - 0 0 1 0.0001112
## 35 <NA> - 0 0 1 0.0001112
## 36 <NA> - 0 0 1 0.0001112
## 47 <NA> - 0 0 1 0.0001112
## 54 <NA> - 0 0 1 0.0001112
## 68 <NA> - 0 0 1 0.0001112
## 81 inside transcription region + 1608 1090 1 0.0001112
## 86 3'UTR + 1608 1090 1 0.0001112
## 97 3'UTR + 4689 1262 1 0.0001112
## 98 3'UTR + 4689 1262 1 0.0001112
## 106 3'UTR + 4689 1262 1 0.0001112
## 125 <NA> - 938 938 1 0.0001112
## penArea1 penArea2
## 3 1490.9 1439.6
## 7 1227.5 1187.9
## 8 1248.2 1222.9
## 11 2635.1 2436.9
## 14 1403.0 1228.3
## 24 1368.7 1324.5
## 25 1128.9 1107.2
## 26 1413.1 1376.5
## 30 1494.2 1445.3
## 35 765.3 765.3
## 36 576.0 576.0
## 47 671.9 671.9
## 54 1078.1 1078.1
## 68 1218.7 1189.1
## 81 1126.8 1093.9
## 86 1104.8 1066.8
## 97 544.2 544.2
## 98 650.0 650.0
## 106 1571.4 1525.4
## 125 1127.2 1088.4
Below are the best 20 regions ordered by p-value (pooled ones). There are 42273 regions with pooled p-value less than 0.05 and 11525 that also have a length greater than 10 base-pairs.
bestPooled <- head(order(fullRegsAnno$pvaluesPool), 20)
## Genome plots
for (idx in bestPooled) {
regionPlot(idx)
}
## Output detail
fullRegsAnno[bestPooled, ]
## chr start end value area cluster indexStart indexEnd L clusterL
## 1 chr1 14496 14496 21.24 21.24 1 543 543 1 208851
## 2 chr1 14498 14520 27.41 630.54 1 545 567 23 208851
## 3 chr1 14524 14568 44.01 1980.61 1 571 615 45 208851
## 4 chr1 14613 14618 23.65 141.90 1 660 665 6 208851
## 5 chr1 14620 14621 22.88 45.76 1 667 668 2 208851
## 6 chr1 14628 14628 21.16 21.16 1 675 675 1 208851
## 7 chr1 14652 14682 39.60 1227.45 1 699 729 31 208851
## 8 chr1 14985 15020 38.47 1384.78 1 892 927 36 208851
## 9 chr1 15827 15828 23.98 47.97 1 977 978 2 208851
## 10 chr1 15832 15832 21.57 21.57 1 982 982 1 208851
## 11 chr1 15834 15905 72.23 5200.70 1 984 1055 72 208851
## 12 chr1 15908 15914 28.71 200.95 1 1058 1064 7 208851
## 13 chr1 15919 15922 26.07 104.29 1 1069 1072 4 208851
## 14 chr1 16921 17038 35.63 4203.90 1 1320 1437 118 208851
## 15 chr1 17294 17294 21.35 21.35 1 1516 1516 1 208851
## 16 chr1 17636 17637 21.53 43.06 1 1661 1662 2 208851
## 17 chr1 17648 17656 22.29 200.65 1 1673 1681 9 208851
## 18 chr1 17706 17747 31.36 1317.19 1 1731 1772 42 208851
## 19 chr1 17930 17936 23.85 166.94 1 1783 1789 7 208851
## 20 chr1 18281 18284 24.32 97.27 1 1928 1931 4 208851
## pvalues qvalues significant significantQval name annotation
## 1 0.762162 0.397218 FALSE FALSE WASH7P NR_024540
## 2 0.032432 0.069893 TRUE TRUE WASH7P NR_024540
## 3 0.001081 0.004675 TRUE TRUE WASH7P NR_024540
## 4 0.202162 0.205469 FALSE FALSE WASH7P NR_024540
## 5 0.314595 0.265980 FALSE FALSE WASH7P NR_024540
## 6 0.795676 0.408462 FALSE FALSE WASH7P NR_024540
## 7 0.001081 0.004675 TRUE TRUE WASH7P NR_024540
## 8 0.001081 0.004675 TRUE TRUE WASH7P NR_024540
## 9 0.160000 0.177187 FALSE FALSE WASH7P NR_024540
## 10 0.642162 0.373408 FALSE FALSE WASH7P NR_024540
## 11 0.001081 0.004675 TRUE TRUE WASH7P NR_024540
## 12 0.015135 0.040782 TRUE TRUE WASH7P NR_024540
## 13 0.061622 0.104660 FALSE FALSE WASH7P NR_024540
## 14 0.001081 0.004675 TRUE TRUE WASH7P NR_024540
## 15 0.722162 0.391637 FALSE FALSE WASH7P NR_024540
## 16 0.648649 0.373408 FALSE FALSE WASH7P NR_024540
## 17 0.422703 0.305249 FALSE FALSE WASH7P NR_024540
## 18 0.003243 0.011858 TRUE TRUE WASH7P NR_024540
## 19 0.177297 0.189232 FALSE FALSE WASH7P NR_024540
## 20 0.141622 0.168419 FALSE FALSE WASH7P NR_024540
## description region distance subregion insidedistance exonnumber nexons
## 1 <NA> <NA> 0 <NA> NA NA 7
## 2 <NA> <NA> 0 <NA> NA NA 7
## 3 <NA> <NA> 0 <NA> NA NA 7
## 4 <NA> <NA> 0 <NA> NA NA 7
## 5 <NA> <NA> 0 <NA> NA NA 7
## 6 <NA> <NA> 0 <NA> NA NA 7
## 7 <NA> <NA> 0 <NA> NA NA 7
## 8 <NA> <NA> 0 <NA> NA NA 7
## 9 <NA> <NA> 0 <NA> NA NA 5
## 10 <NA> <NA> 0 <NA> NA NA 5
## 11 <NA> <NA> 0 <NA> NA NA 5
## 12 <NA> <NA> 0 <NA> NA NA 5
## 13 <NA> <NA> 0 <NA> NA NA 5
## 14 <NA> <NA> 0 <NA> NA NA 7
## 15 <NA> <NA> 0 <NA> NA NA 4
## 16 <NA> <NA> 0 <NA> NA NA 7
## 17 <NA> <NA> 0 <NA> NA NA 7
## 18 <NA> <NA> 0 <NA> NA NA 7
## 19 <NA> <NA> 0 <NA> NA NA 7
## 20 <NA> <NA> 0 <NA> NA NA 7
## UTR strand geneL codingL chrnum pvaluesPool penArea1 penArea2
## 1 <NA> - 0 0 1 0.0001112 21.24 21.24
## 2 <NA> - 0 0 1 0.0001112 630.54 630.54
## 3 <NA> - 0 0 1 0.0001112 1490.87 1439.60
## 4 <NA> - 0 0 1 0.0001112 141.90 141.90
## 5 <NA> - 0 0 1 0.0001112 45.76 45.76
## 6 <NA> - 0 0 1 0.0001112 21.16 21.16
## 7 <NA> - 0 0 1 0.0001112 1227.45 1187.86
## 8 <NA> - 0 0 1 0.0001112 1248.21 1222.91
## 9 <NA> - 0 0 1 0.0001112 47.97 47.97
## 10 <NA> - 0 0 1 0.0001112 21.57 21.57
## 11 <NA> - 0 0 1 0.0001112 2635.07 2436.94
## 12 <NA> - 0 0 1 0.0001112 200.95 200.95
## 13 <NA> - 0 0 1 0.0001112 104.29 104.29
## 14 <NA> - 0 0 1 0.0001112 1402.99 1228.30
## 15 <NA> - 0 0 1 0.0001112 21.35 21.35
## 16 <NA> - 0 0 1 0.0001112 43.06 43.06
## 17 <NA> - 0 0 1 0.0001112 200.65 200.65
## 18 <NA> - 0 0 1 0.0001112 1049.49 1018.78
## 19 <NA> - 0 0 1 0.0001112 166.94 166.94
## 20 <NA> - 0 0 1 0.0001112 97.27 97.27
Below are the best 20 regions ordered by area.
bestArea <- head(order(fullRegsAnno$area, decreasing = TRUE), 20)
## Genome plots
for (idx in bestArea) {
regionPlot(idx)
}
## Output detail
fullRegsAnno[bestArea, ]
## chr start end value area cluster indexStart indexEnd
## 41217 chrX 73065732 73066652 72.16 66462 23 930786 931706
## 41144 chrX 73046528 73047108 93.62 54392 23 925212 925792
## 41208 chrX 73064868 73065420 85.09 47057 23 929947 930499
## 41171 chrX 73062356 73062878 62.27 32567 23 927741 928263
## 41157 chrX 73048956 73049275 66.57 21303 23 926401 926720
## 41287 chrX 73071466 73071782 63.78 20217 23 934813 935129
## 25909 chr14 106203213 106203756 33.66 18313 18 1576583 1577126
## 28988 chr16 22547417 22547711 59.31 17496 1 985283 985577
## 41138 chrX 73045850 73046143 57.42 16881 23 924534 924827
## 41063 chrX 73041843 73042153 49.20 15303 23 921481 921791
## 16255 chr8 57014691 57014750 241.65 14499 9 764313 764372
## 11848 chr6 5973899 5973953 255.45 14050 1 92967 93021
## 12335 chr6 31324267 31324427 87.06 14017 3 593808 593968
## 26238 chr14 106405612 106406098 28.11 13689 18 1633317 1633803
## 29301 chr16 29392794 29392970 71.06 12577 1 1224459 1224635
## 26063 chr14 106323037 106323474 28.54 12500 18 1603520 1603957
## 41290 chrX 73072026 73072218 64.48 12445 23 935198 935390
## 41158 chrX 73053073 73053251 66.88 11972 23 926777 926955
## 24432 chr13 68406742 68406793 230.18 11969 14 562098 562149
## 25908 chr14 106202855 106203208 33.64 11910 18 1576225 1576578
## L clusterL pvalues qvalues significant significantQval
## 41217 921 170283 0.003521 0.01477 TRUE TRUE
## 41144 581 170283 0.003521 0.01477 TRUE TRUE
## 41208 553 170283 0.003521 0.01477 TRUE TRUE
## 41171 523 170283 0.003521 0.01477 TRUE TRUE
## 41157 320 170283 0.003521 0.01477 TRUE TRUE
## 41287 317 170283 0.003521 0.01477 TRUE TRUE
## 25909 544 298055 0.010959 0.08090 TRUE TRUE
## 28988 295 1577510 0.003077 0.01962 TRUE TRUE
## 41138 294 170283 0.003521 0.01477 TRUE TRUE
## 41063 311 170283 0.003521 0.01477 TRUE TRUE
## 16255 60 80903 0.003597 0.01894 TRUE TRUE
## 11848 55 189132 0.002398 0.02654 TRUE TRUE
## 12335 161 729134 0.002398 0.02654 TRUE TRUE
## 26238 487 298055 0.035616 0.12191 TRUE FALSE
## 29301 177 1577510 0.003077 0.01962 TRUE TRUE
## 26063 438 298055 0.032877 0.12191 TRUE FALSE
## 41290 193 170283 0.003521 0.01477 TRUE TRUE
## 41158 179 170283 0.003521 0.01477 TRUE TRUE
## 24432 52 2579 0.009174 0.03720 TRUE TRUE
## 25908 354 298055 0.010959 0.08090 TRUE TRUE
## name annotation description region distance
## 41217 XIST NR_001564 <NA> <NA> 0
## 41144 TSIX NR_003255 <NA> <NA> 0
## 41208 XIST NR_001564 <NA> <NA> 0
## 41171 XIST NR_001564 <NA> <NA> 0
## 41157 TSIX NR_003255 <NA> <NA> 0
## 41287 XIST NR_001564 <NA> <NA> 0
## 25909 KIAA0125 NM_001786 upstream upstream 152224
## 28988 LOC100132247 <NA> inside exon inside 4812
## 41138 TSIX NR_003255 <NA> <NA> 0
## 41063 TSIX NR_003255 <NA> <NA> 0
## 16255 MOS NP_001663 downstream downstream 11791
## 11848 NRN1 <NA> <NA> <NA> 0
## 12335 HLA-B <NA> inside intron inside 307
## 26238 KIAA0125 NM_001786 <NA> <NA> 0
## 29301 SNX29P2 NR_002939 <NA> <NA> 0
## 26063 KIAA0125 NM_001786 upstream upstream 32506
## 41290 XIST NR_001564 <NA> <NA> 0
## 41158 XIST NR_001564 <NA> <NA> 0
## 24432 PCDH9 <NA> upstream upstream 602274
## 25908 KIAA0125 NM_001786 upstream upstream 152772
## subregion insidedistance exonnumber nexons
## 41217 <NA> NA NA 6
## 41144 <NA> NA NA 1
## 41208 <NA> NA NA 6
## 41171 <NA> NA NA 6
## 41157 <NA> NA NA 1
## 41287 <NA> NA NA 6
## 25909 <NA> NA NA 4
## 28988 inside exon 0 4 4
## 41138 <NA> NA NA 1
## 41063 <NA> NA NA 1
## 16255 <NA> NA NA 1
## 11848 <NA> NA NA 3
## 12335 inside intron -38 1 2
## 26238 <NA> NA NA 6
## 29301 <NA> NA NA 7
## 26063 <NA> NA NA 4
## 41290 <NA> NA NA 6
## 41158 <NA> NA NA 6
## 24432 <NA> NA NA 5
## 25908 <NA> NA NA 4
## UTR strand geneL codingL chrnum pvaluesPool
## 41217 <NA> - 0 0 X 0.0001112
## 41144 <NA> + 0 0 X 0.0001112
## 41208 <NA> - 0 0 X 0.0001112
## 41171 <NA> - 0 0 X 0.0001112
## 41157 <NA> + 0 0 X 0.0001112
## 41287 <NA> - 0 0 X 0.0001112
## 25909 <NA> + 32550 1546 14 0.0001112
## 28988 overlaps 3'UTR + 5236 4321 16 0.0001112
## 41138 <NA> + 0 0 X 0.0001112
## 41063 <NA> + 0 0 X 0.0001112
## 16255 <NA> - 1040 1040 8 0.0001112
## 11848 <NA> - 0 0 6 0.0001112
## 12335 inside transcription region - 1435 871 6 0.0001112
## 26238 <NA> + 0 0 14 0.0001112
## 29301 <NA> + 0 0 16 0.0001112
## 26063 <NA> + 32550 1546 14 0.0001112
## 41290 <NA> - 0 0 X 0.0001112
## 41158 <NA> - 0 0 X 0.0001112
## 24432 <NA> - 927502 923785 13 0.0001112
## 25908 <NA> + 32550 1546 14 0.0001112
## penArea1 penArea2
## 41217 4319 2655
## 41144 5006 3399
## 41208 4499 3085
## 41171 3251 2254
## 41157 3131 2375
## 41287 2994 2274
## 25909 1773 1220
## 28988 2745 2110
## 41138 2655 2043
## 41063 2301 1754
## 16255 8573 8071
## 11848 8941 8486
## 12335 3608 3036
## 26238 1444 1015
## 29301 2993 2486
## 26063 1433 1028
## 41290 2758 2263
## 41158 2823 2341
## 24432 7985 7617
## 25908 1615 1204
Below are the best 20 regions ordered by penalized area (scheme 1).
bestPen1Area <- head(order(fullRegsAnno$penArea1, decreasing = TRUE), 20)
## Genome plots
for (idx in bestPen1Area) {
regionPlot(idx)
}
## Output detail
fullRegsAnno[bestPen1Area, ]
## chr start end value area cluster indexStart indexEnd L
## 11848 chr6 5973899 5973953 255.5 14050 1 92967 93021 55
## 16255 chr8 57014691 57014750 241.6 14499 9 764313 764372 60
## 24432 chr13 68406742 68406793 230.2 11969 14 562098 562149 52
## 24418 chr13 61211942 61211988 220.2 10348 8 548916 548962 47
## 19718 chr10 126556165 126556216 209.1 10875 20 1895727 1895778 52
## 8472 chr3 183833902 183833952 180.2 9189 53 2466186 2466236 51
## 16302 chr8 70899787 70899844 166.0 9629 12 899205 899262 58
## 2197 chr1 115079805 115079858 160.1 8644 28 2598156 2598209 54
## 37532 chr20 21095583 21095643 154.4 9418 5 325744 325804 61
## 4746 chr2 61644256 61644310 155.9 8573 21 885719 885773 55
## 3474 chr1 201947862 201947913 156.8 8155 48 4134040 4134091 52
## 3429 chr1 187532975 187533029 154.8 8514 41 4006009 4006063 55
## 34164 chr18 18699328 18699381 152.0 8206 4 225839 225892 54
## 16537 chr8 102381423 102381469 155.3 7301 21 1219137 1219183 47
## 3415 chr1 185196775 185196827 151.1 8008 41 3985494 3985546 53
## 14430 chr7 63602128 63602187 147.7 8862 12 908720 908779 60
## 27207 chr15 79155773 79155827 149.4 8218 9 1301187 1301241 55
## 13014 chr6 56736096 56736160 145.3 9444 11 1364846 1364910 65
## 3039 chr1 158495473 158495525 149.3 7914 37 3451365 3451417 53
## 7269 chr3 48162537 48162582 150.5 6922 15 731985 732030 46
## clusterL pvalues qvalues significant significantQval name
## 11848 189132 0.002398 0.026542 TRUE TRUE NRN1
## 16255 80903 0.003597 0.018939 TRUE TRUE MOS
## 24432 2579 0.009174 0.037202 TRUE TRUE PCDH9
## 24418 18195 0.009174 0.037202 TRUE TRUE TDRD3
## 19718 42428 0.003367 0.034869 TRUE TRUE FAM175B
## 8472 485196 0.002141 0.026714 TRUE TRUE HTR3E
## 16302 88048 0.003597 0.018939 TRUE TRUE PRDM14
## 2197 85499 0.001081 0.004675 TRUE TRUE DENND2C
## 37532 98566 0.004444 0.030002 TRUE TRUE PLK1S1
## 4746 153251 0.002053 0.015686 TRUE TRUE USP34
## 3474 195658 0.001081 0.004675 TRUE TRUE RNPEP
## 3429 185407 0.001081 0.004675 TRUE TRUE PLA2G4A
## 34164 79962 0.006711 0.016171 TRUE TRUE ROCK1
## 16537 206310 0.003597 0.018939 TRUE TRUE NACAP1
## 3415 185407 0.001081 0.004675 TRUE TRUE SWT1
## 14430 554568 0.001208 0.006552 TRUE TRUE ZNF727
## 27207 44548 0.002778 0.025589 TRUE TRUE MORF4L1
## 13014 88851 0.002398 0.026542 TRUE TRUE DST
## 3039 710950 0.001081 0.004675 TRUE TRUE OR6Y1
## 7269 638180 0.002141 0.026714 TRUE TRUE MAP4
## annotation description region distance subregion
## 11848 <NA> <NA> <NA> 0 <NA>
## 16255 NP_001663 downstream downstream 11791 <NA>
## 24432 <NA> upstream upstream 602274 <NA>
## 24418 <NA> downstream downstream 241351 <NA>
## 19718 NM_001164318 downstream downstream 65811 <NA>
## 8472 <NA> downstream downstream 19050 <NA>
## 16302 NP_001034662 downstream downstream 83718 <NA>
## 2197 <NA> <NA> <NA> 0 <NA>
## 37532 <NA> upstream upstream 10981 <NA>
## 4746 <NA> inside intron inside 53539 inside intron
## 3474 NM_001024808 upstream upstream 3853 <NA>
## 3429 <NA> downstream downstream 734943 <NA>
## 34164 <NA> upstream upstream 7516 <NA>
## 16537 <NA> inside exon inside 302 inside exon
## 3415 <NA> inside intron inside 70484 inside intron
## 14430 <NA> downstream downstream 96307 <NA>
## 27207 <NA> upstream upstream 9345 <NA>
## 13014 <NA> upstream upstream 19382 <NA>
## 3039 <NA> downstream downstream 22370 <NA>
## 7269 <NA> upstream upstream 31768 <NA>
## insidedistance exonnumber nexons UTR strand
## 11848 NA NA 3 <NA> -
## 16255 NA NA 1 <NA> -
## 24432 NA NA 5 <NA> -
## 24418 NA NA 14 <NA> +
## 19718 NA NA 9 <NA> +
## 8472 NA NA 9 <NA> +
## 16302 NA NA 8 <NA> -
## 2197 NA NA 30 <NA> -
## 37532 NA NA 14 <NA> +
## 4746 -3571 2 80 inside transcription region -
## 3474 NA NA 11 <NA> +
## 3429 NA NA 18 <NA> +
## 34164 NA NA 33 <NA> -
## 16537 0 1 1 inside transcription region +
## 3415 3882 16 19 inside transcription region +
## 14430 NA NA 4 <NA> +
## 27207 NA NA 6 <NA> +
## 13014 NA NA 34 <NA> -
## 3039 NA NA 1 <NA> -
## 7269 NA NA 19 <NA> -
## geneL codingL chrnum pvaluesPool penArea1 penArea2
## 11848 0 0 6 0.0001112 8941 8486
## 16255 1040 1040 8 0.0001112 8573 8071
## 24432 927502 923785 13 0.0001112 7985 7617
## 24418 177422 107174 13 0.0001112 7513 7229
## 19718 34885 33141 10 0.0001112 7255 6920
## 8472 9931 9163 3 0.0001112 6231 5954
## 16302 19676 17783 8 0.0001112 5859 5534
## 2197 0 0 1 0.0001112 5586 5311
## 37532 120634 120457 20 0.0001112 5492 5162
## 4746 283259 282590 2 0.0001112 5455 5178
## 3474 23509 23049 1 0.0001112 5440 5189
## 3429 160081 134153 1 0.0001112 5418 5143
## 34164 162109 159526 18 0.0001112 5303 5042
## 16537 702 407 8 0.0001112 5301 5101
## 3415 134622 129961 1 0.0001112 5258 5007
## 14430 33106 32927 7 0.0001112 5240 4933
## 27207 18875 18112 15 0.0001112 5230 4964
## 13014 237560 237434 6 0.0001112 5218 4875
## 3039 977 977 1 0.0001112 5196 4948
## 7269 238589 145828 3 0.0001112 5116 4932
Below are the best 20 regions ordered by penalized area (scheme 2).
bestPen2Area <- head(order(fullRegsAnno$penArea2, decreasing = TRUE), 20)
## Genome plots
for (idx in bestPen2Area) {
regionPlot(idx)
}
## Output detail
fullRegsAnno[bestPen2Area, ]
## chr start end value area cluster indexStart indexEnd L
## 11848 chr6 5973899 5973953 255.5 14050 1 92967 93021 55
## 16255 chr8 57014691 57014750 241.6 14499 9 764313 764372 60
## 24432 chr13 68406742 68406793 230.2 11969 14 562098 562149 52
## 24418 chr13 61211942 61211988 220.2 10348 8 548916 548962 47
## 19718 chr10 126556165 126556216 209.1 10875 20 1895727 1895778 52
## 8472 chr3 183833902 183833952 180.2 9189 53 2466186 2466236 51
## 16302 chr8 70899787 70899844 166.0 9629 12 899205 899262 58
## 2197 chr1 115079805 115079858 160.1 8644 28 2598156 2598209 54
## 3474 chr1 201947862 201947913 156.8 8155 48 4134040 4134091 52
## 4746 chr2 61644256 61644310 155.9 8573 21 885719 885773 55
## 37532 chr20 21095583 21095643 154.4 9418 5 325744 325804 61
## 3429 chr1 187532975 187533029 154.8 8514 41 4006009 4006063 55
## 16537 chr8 102381423 102381469 155.3 7301 21 1219137 1219183 47
## 34164 chr18 18699328 18699381 152.0 8206 4 225839 225892 54
## 3415 chr1 185196775 185196827 151.1 8008 41 3985494 3985546 53
## 27207 chr15 79155773 79155827 149.4 8218 9 1301187 1301241 55
## 3039 chr1 158495473 158495525 149.3 7914 37 3451365 3451417 53
## 14430 chr7 63602128 63602187 147.7 8862 12 908720 908779 60
## 7269 chr3 48162537 48162582 150.5 6922 15 731985 732030 46
## 13014 chr6 56736096 56736160 145.3 9444 11 1364846 1364910 65
## clusterL pvalues qvalues significant significantQval name
## 11848 189132 0.002398 0.026542 TRUE TRUE NRN1
## 16255 80903 0.003597 0.018939 TRUE TRUE MOS
## 24432 2579 0.009174 0.037202 TRUE TRUE PCDH9
## 24418 18195 0.009174 0.037202 TRUE TRUE TDRD3
## 19718 42428 0.003367 0.034869 TRUE TRUE FAM175B
## 8472 485196 0.002141 0.026714 TRUE TRUE HTR3E
## 16302 88048 0.003597 0.018939 TRUE TRUE PRDM14
## 2197 85499 0.001081 0.004675 TRUE TRUE DENND2C
## 3474 195658 0.001081 0.004675 TRUE TRUE RNPEP
## 4746 153251 0.002053 0.015686 TRUE TRUE USP34
## 37532 98566 0.004444 0.030002 TRUE TRUE PLK1S1
## 3429 185407 0.001081 0.004675 TRUE TRUE PLA2G4A
## 16537 206310 0.003597 0.018939 TRUE TRUE NACAP1
## 34164 79962 0.006711 0.016171 TRUE TRUE ROCK1
## 3415 185407 0.001081 0.004675 TRUE TRUE SWT1
## 27207 44548 0.002778 0.025589 TRUE TRUE MORF4L1
## 3039 710950 0.001081 0.004675 TRUE TRUE OR6Y1
## 14430 554568 0.001208 0.006552 TRUE TRUE ZNF727
## 7269 638180 0.002141 0.026714 TRUE TRUE MAP4
## 13014 88851 0.002398 0.026542 TRUE TRUE DST
## annotation description region distance subregion
## 11848 <NA> <NA> <NA> 0 <NA>
## 16255 NP_001663 downstream downstream 11791 <NA>
## 24432 <NA> upstream upstream 602274 <NA>
## 24418 <NA> downstream downstream 241351 <NA>
## 19718 NM_001164318 downstream downstream 65811 <NA>
## 8472 <NA> downstream downstream 19050 <NA>
## 16302 NP_001034662 downstream downstream 83718 <NA>
## 2197 <NA> <NA> <NA> 0 <NA>
## 3474 NM_001024808 upstream upstream 3853 <NA>
## 4746 <NA> inside intron inside 53539 inside intron
## 37532 <NA> upstream upstream 10981 <NA>
## 3429 <NA> downstream downstream 734943 <NA>
## 16537 <NA> inside exon inside 302 inside exon
## 34164 <NA> upstream upstream 7516 <NA>
## 3415 <NA> inside intron inside 70484 inside intron
## 27207 <NA> upstream upstream 9345 <NA>
## 3039 <NA> downstream downstream 22370 <NA>
## 14430 <NA> downstream downstream 96307 <NA>
## 7269 <NA> upstream upstream 31768 <NA>
## 13014 <NA> upstream upstream 19382 <NA>
## insidedistance exonnumber nexons UTR strand
## 11848 NA NA 3 <NA> -
## 16255 NA NA 1 <NA> -
## 24432 NA NA 5 <NA> -
## 24418 NA NA 14 <NA> +
## 19718 NA NA 9 <NA> +
## 8472 NA NA 9 <NA> +
## 16302 NA NA 8 <NA> -
## 2197 NA NA 30 <NA> -
## 3474 NA NA 11 <NA> +
## 4746 -3571 2 80 inside transcription region -
## 37532 NA NA 14 <NA> +
## 3429 NA NA 18 <NA> +
## 16537 0 1 1 inside transcription region +
## 34164 NA NA 33 <NA> -
## 3415 3882 16 19 inside transcription region +
## 27207 NA NA 6 <NA> +
## 3039 NA NA 1 <NA> -
## 14430 NA NA 4 <NA> +
## 7269 NA NA 19 <NA> -
## 13014 NA NA 34 <NA> -
## geneL codingL chrnum pvaluesPool penArea1 penArea2
## 11848 0 0 6 0.0001112 8941 8486
## 16255 1040 1040 8 0.0001112 8573 8071
## 24432 927502 923785 13 0.0001112 7985 7617
## 24418 177422 107174 13 0.0001112 7513 7229
## 19718 34885 33141 10 0.0001112 7255 6920
## 8472 9931 9163 3 0.0001112 6231 5954
## 16302 19676 17783 8 0.0001112 5859 5534
## 2197 0 0 1 0.0001112 5586 5311
## 3474 23509 23049 1 0.0001112 5440 5189
## 4746 283259 282590 2 0.0001112 5455 5178
## 37532 120634 120457 20 0.0001112 5492 5162
## 3429 160081 134153 1 0.0001112 5418 5143
## 16537 702 407 8 0.0001112 5301 5101
## 34164 162109 159526 18 0.0001112 5303 5042
## 3415 134622 129961 1 0.0001112 5258 5007
## 27207 18875 18112 15 0.0001112 5230 4964
## 3039 977 977 1 0.0001112 5196 4948
## 14430 33106 32927 7 0.0001112 5240 4933
## 7269 238589 145828 3 0.0001112 5116 4932
## 13014 237560 237434 6 0.0001112 5218 4875
## Process the time info
time <- lapply(fullTime, function(x) data.frame(diff(x)))
time <- do.call(rbind, time)
colnames(time) <- "sec"
time$sec <- as.integer(round(time$sec))
time$min <- time$sec/60
time$chr <- paste0("chr", gsub("\\..*", "", rownames(time)))
time$step <- gsub(".*\\.", "", rownames(time))
rownames(time) <- seq_len(nrow(time))
## Make plot
ggplot(time, aes(x = step, y = min, colour = chr)) + geom_point() + labs(title = "Wallclock time by step") +
scale_colour_discrete(limits = chrs) + scale_x_discrete(limits = names(fullTime[[1]])[-1]) +
ylab("Time (min)") + xlab("Step")
This plot shows the wallclock time spent in each of the derfinder2 analysis steps.
Below is the information on how the samples were permutted when performing the permutations.
## Get the permutation information
nSamples <- seq_len(length(optionsStats$group))
seeds <- 1:10
permuteInfo <- lapply(seeds, function(x) {
set.seed(x)
idx <- sample(nSamples)
data.frame(optionsStats$group[idx])
})
permuteInfo <- cbind(data.frame(optionsStats$group), do.call(cbind, permuteInfo))
colnames(permuteInfo) <- c("original", paste0("perm", 1:10))
## The raw information permuteInfo
n <- names(table(permuteInfo[, 2]))
permuteDetail <- data.frame(matrix(NA, nrow = 10 * length(n), ncol = 2 + length(n)))
permuteDetail[, 1] <- rep(1:10, each = length(n))
permuteDetail[, 2] <- rep(n, 10)
colnames(permuteDetail) <- c("permutation", "group", as.character(n))
l <- 1
m <- 3:ncol(permuteDetail)
for (j in n) {
k <- which(permuteInfo[, 1] == j)
for (i in 2:11) {
permuteDetail[l, m] <- table(permuteInfo[k, i])
l <- l + 1
}
}
permuteDetail
## permutation group CEU YRI
## 1 1 CEU 16 5
## 2 1 YRI 15 6
## 3 2 CEU 15 6
## 4 2 YRI 14 7
## 5 3 CEU 14 7
## 6 3 YRI 12 9
## 7 4 CEU 14 7
## 8 4 YRI 14 7
## 9 5 CEU 14 7
## 10 5 YRI 15 6
## 11 6 CEU 5 5
## 12 6 YRI 6 4
## 13 7 CEU 6 4
## 14 7 YRI 7 3
## 15 8 CEU 7 3
## 16 8 YRI 9 1
## 17 9 CEU 7 3
## 18 9 YRI 7 3
## 19 10 CEU 7 3
## 20 10 YRI 6 4
This table shows how the group labels were permutted. This can be useful to detect whether a permutation in particular had too many samples of a group labeled as another group, meaning that the resulting permutted group label resulted in pretty much a name change.
summary(permuteDetail[, m])
## CEU YRI
## Min. : 5.0 Min. :1
## 1st Qu.: 7.0 1st Qu.:3
## Median :10.5 Median :5
## Mean :10.5 Mean :5
## 3rd Qu.:14.0 3rd Qu.:7
## Max. :16.0 Max. :9
This table shows the summary per group of the first table and can be used for faster detection of anomalies.
## By index
permuteInfoIdx <- lapply(seeds, function(x) {
set.seed(x)
idx <- sample(nSamples)
data.frame(nSamples[idx])
})
permuteInfoIdx <- cbind(data.frame(nSamples), do.call(cbind, permuteInfoIdx))
colnames(permuteInfoIdx) <- c("original", paste0("perm", 1:10))
## Only if you really want to see the indexes
permuteInfoIdx
Note that in derfinder2 the re-sampling of the samples is done without replacement. This is done in an effort to avoid singular model matrices. While the sample balance is the same across the permutations, what changes are the adjusted variables (including the column medians).
## Save the hard work
if (!procFiles) {
save(fullRegsAnno, file = paste0(prefix, "/fullRegsAnnoPooled.Rdata"))
save(fullNullSummary, file = paste0(prefix, "/fullNullSummary.Rdata"))
}
## Date the report was generated
Sys.time()
## [1] "2013-08-15 23:24:00 EDT"
## Processing time in seconds
proc.time()
## user system elapsed
## 4196.9 162.8 5038.5
## Session info
sessionInfo()
## R version 3.0.1 Patched (2013-05-16 r62754)
## Platform: x86_64-unknown-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.iso885915 LC_NUMERIC=C
## [3] LC_TIME=en_US.iso885915 LC_COLLATE=en_US.iso885915
## [5] LC_MONETARY=en_US.iso885915 LC_MESSAGES=en_US.iso885915
## [7] LC_PAPER=C LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.iso885915 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel methods stats graphics grDevices utils
## [8] datasets base
##
## other attached packages:
## [1] rtracklayer_1.20.2
## [2] Cairo_1.5-2
## [3] TxDb.Hsapiens.UCSC.hg19.knownGene_2.9.0
## [4] GenomicFeatures_1.12.1
## [5] AnnotationDbi_1.22.5
## [6] Biobase_2.20.0
## [7] ggbio_1.8.5
## [8] GenomicRanges_1.12.3
## [9] gridExtra_0.9.1
## [10] ggplot2_0.9.3.1
## [11] IRanges_1.18.1
## [12] BiocGenerics_0.6.0
## [13] derfinder2_0.0.13
## [14] RcppArmadillo_0.3.820
## [15] Rcpp_0.10.4
## [16] knitrBootstrap_0.7.0
## [17] markdown_0.6.3
## [18] knitr_1.2
##
## loaded via a namespace (and not attached):
## [1] biomaRt_2.16.0 Biostrings_2.28.0
## [3] biovizBase_1.8.0 bitops_1.0-5
## [5] BSgenome_1.28.0 bumphunter_1.1.11
## [7] cluster_1.14.4 codetools_0.2-8
## [9] colorspace_1.2-2 DBI_0.2-7
## [11] dichromat_2.0-0 digest_0.6.3
## [13] doRNG_1.5.3 evaluate_0.4.3
## [15] foreach_1.4.0 formatR_0.7
## [17] gtable_0.1.2 Hmisc_3.10-1.1
## [19] iterators_1.0.6 itertools_0.1-1
## [21] labeling_0.1 lattice_0.20-15
## [23] MASS_7.3-26 matrixStats_0.8.1
## [25] munsell_0.4 plyr_1.8
## [27] proto_0.3-10 qvalue_1.34.0
## [29] RColorBrewer_1.0-5 RCurl_1.95-4.1
## [31] reshape2_1.2.2 R.methodsS3_1.4.2
## [33] Rsamtools_1.12.3 RSQLite_0.11.3
## [35] scales_0.2.3 stats4_3.0.1
## [37] stringr_0.6.2 tcltk_3.0.1
## [39] tools_3.0.1 VariantAnnotation_1.6.5
## [41] XML_3.96-1.1 zlibbioc_1.6.0
This report written by L. Collado Torres and was generated using knitrBootstrap.
citation("derfinder2")
To cite package 'derfinder2' in publications use:
Leonardo Collado-Torres, Alyssa Frazee, Andrew Jaffe and Jeffrey Leek (2013). derfinder2: Fast differential expression analysis of RNA-seq data at base-pair resolution. R package version 0.0.13. https://github.com/lcolladotor/derfinder2
A BibTeX entry for LaTeX users is
@Manual{, title = {derfinder2: Fast differential expression analysis of RNA-seq data at base-pair resolution}, author = {Leonardo Collado-Torres and Alyssa Frazee and Andrew Jaffe and Jeffrey Leek}, year = {2013}, note = {R package version 0.0.13}, url = {https://github.com/lcolladotor/derfinder2}, }